library(sparklyr)
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
sc <- spark_connect(master = "local", version = "2.4.5")
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.0     ✓ purrr   0.3.4
✓ tibble  3.0.1     ✓ dplyr   0.8.5
✓ tidyr   1.1.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x purrr::invoke() masks sparklyr::invoke()
x dplyr::lag()    masks stats::lag()
profiles <- spark_read_csv(
  sc, 
  "data/profiles.csv",
  escape = "\"",
  memory = FALSE,
  options = list(multiline = TRUE)
)

glimpse(profiles)
Rows: ??
Columns: 31
Database: spark_connection
$ age         <int> 47, 27, 45, 40, 33, 27, 20, 28, 24, 34, 32, 23, 19, 45, 32, 38, 36, 25, …
$ body_type   <chr> "athletic", "full figured", NA, "athletic", "athletic", "thin", "curvy",…
$ diet        <chr> "mostly anything", "mostly vegetarian", NA, NA, "mostly anything", "most…
$ drinks      <chr> "socially", "socially", "socially", "socially", "rarely", "socially", "s…
$ drugs       <chr> "never", NA, "never", "never", "never", "sometimes", "never", "never", "…
$ education   <chr> "graduated from college/university", "graduated from college/university"…
$ essay0      <chr> NA, "still figuring out what to put here... for now i'll leave you with\…
$ essay1      <chr> "working in a creative industry (brand marketing agency), helping to\nra…
$ essay2      <chr> "listening to people, hearing them and working to help them. it's\nwhat …
$ essay3      <chr> "my million dollar smile (self-deprecating humor).", "my smile, my energ…
$ essay4      <chr> "sadly, my reading is confined to newspapers and business\nperiodicals. …
$ essay5      <chr> "my daughter. usa today. exercise. some travel. world news. ice\ncream."…
$ essay6      <chr> NA, NA, "why i fill these blocks.<br />\nwhat i really, really want.<br …
$ essay7      <chr> "picking up my daughter for the weekend or going to the gym and\ncoming …
$ essay8      <chr> "i can be a procrastinator but it is often driven by my need to\nresearc…
$ essay9      <chr> "you are looking for an easy going, no drama, try anything once\namigo t…
$ ethnicity   <chr> "white", "white", "other", "white", "white", "hispanic / latin, white", …
$ height      <int> 72, 63, 66, 62, 74, 71, 68, 66, 62, 67, 61, 59, 68, 68, 67, 73, 75, 67, …
$ income      <int> -1, -1, -1, -1, -1, -1, 20000, -1, -1, -1, 70000, -1, -1, -1, -1, 70000,…
$ job         <chr> "sales / marketing / biz dev", "science / tech / engineering", "medicine…
$ last_online <chr> "2012-06-29-22-37", "2012-06-30-12-50", "2012-06-30-18-21", "2012-06-30-…
$ location    <chr> "san carlos, california", "oakland, california", "san francisco, califor…
$ offspring   <chr> "has a kid, but doesn&rsquo;t want more", "doesn&rsquo;t have kids", NA,…
$ orientation <chr> "straight", "straight", "straight", "straight", "gay", "gay", "straight"…
$ pets        <chr> "likes dogs", "likes dogs", "likes dogs and likes cats", "has dogs", NA,…
$ religion    <chr> "other", "atheism but not too serious about it", "catholicism but not to…
$ sex         <chr> "m", "f", "m", "f", "m", "m", "f", "f", "f", "f", "f", "f", "m", "m", "f…
$ sign        <chr> "scorpio and it&rsquo;s fun to think about", "sagittarius but it doesn&r…
$ smokes      <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "yes",…
$ speaks      <chr> "english", "english", "english (fluently)", "english", "english", "engli…
$ status      <chr> "single", "single", "single", "single", "single", "single", "single", "s…
library(dplyr)
profiles %>%
  summarise_all(.funs = ~ sum(as.integer(is.na(.))))
Missing values are always removed in SQL.
Use `SUM(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
profiles %>%
  summarise(num_neg_income = sum(as.integer(income < 0)))
profiles_char <- profiles %>%
  select(-c(age, income, height)) %>%
  mutate_all(~ ifelse(is.na(.), "missing", .))

profiles_num <- profiles %>%
  select(age, income, height) %>%
  mutate(
    age = as.numeric(age),
    income = ifelse(income == "-1", NA, as.numeric(income)),
    height = as.numeric(height)
  )

profiles <- sdf_bind_cols(profiles_char, profiles_num) %>%
  compute("profiles")
profiles %>%
  summarise_all(~sum(as.integer(is.na(.))))
Missing values are always removed in SQL.
Use `SUM(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
profiles %>%
  summarise(num_neg_income = sum(as.integer(income < 0)))
glimpse(profiles)
Rows: ??
Columns: 31
Database: spark_connection
$ body_type   <chr> "thin", "curvy", "average", "fit", "athletic", "curvy", "fit", "average"…
$ diet        <chr> "mostly anything", "mostly anything", "strictly anything", "anything", "…
$ drinks      <chr> "socially", "socially", "often", "socially", "often", "socially", "socia…
$ drugs       <chr> "sometimes", "never", "sometimes", "never", "sometimes", "missing", "som…
$ education   <chr> "working on masters program", "working on college/university", "working …
$ essay0      <chr> "grad student at berkeley. i am moving to sf pretty soon and wanna\nmeet…
$ essay1      <chr> "doing my mba at berkeley and working part time. i also try to swim\neve…
$ essay2      <chr> "missing", "music. english. chemistry. dancing.", "drinking! .... i thin…
$ essay3      <chr> "i look younger than i am..", "my smile, because i am pretty much always…
$ essay4      <chr> "music: wow where do i start? radiohead, led zeppelin, pink floyd,\nphoe…
$ essay5      <chr> "- inspiring people<br />\n- swimming<br />\n- great food<br />\n- great…
$ essay6      <chr> "- how crazy life is<br />\n- finding a guy just like you (hopefully) (:…
$ essay7      <chr> "super happy", "either reading a book or seeing a live show at a local c…
$ essay8      <chr> "missing", "i can honestly say this, im a cryer.<br />\ni have the emoti…
$ essay9      <chr> "you want to meet me one of these days..", "you like a goofy sense of hu…
$ ethnicity   <chr> "hispanic / latin, white", "hispanic / latin", "white", "hispanic / lati…
$ job         <chr> "sales / marketing / biz dev", "hospitality / travel", "other", "artisti…
$ last_online <chr> "2012-06-23-08-25", "2012-04-19-12-59", "2012-01-11-22-11", "2012-06-30-…
$ location    <chr> "berkeley, california", "richmond, california", "burlingame, california"…
$ offspring   <chr> "doesn&rsquo;t have kids", "doesn&rsquo;t want kids", "doesn&rsquo;t wan…
$ orientation <chr> "gay", "straight", "straight", "gay", "straight", "bisexual", "straight"…
$ pets        <chr> "missing", "has dogs and dislikes cats", "has cats", "likes dogs", "has …
$ religion    <chr> "agnosticism", "agnosticism", "missing", "catholicism but not too seriou…
$ sex         <chr> "m", "f", "f", "m", "m", "f", "f", "m", "m", "m", "m", "f", "m", "m", "m…
$ sign        <chr> "libra", "leo", "aquarius but it doesn&rsquo;t matter", "scorpio and it&…
$ smokes      <chr> "no", "no", "yes", "when drinking", "no", "no", "no", "when drinking", "…
$ speaks      <chr> "english (fluently), portuguese (fluently), spanish (okay), french (okay…
$ status      <chr> "single", "single", "single", "available", "single", "single", "single",…
$ age         <dbl> 27, 20, 23, 19, 45, 19, 24, 48, 24, 26, 41, 33, 27, 55, 22, 26, 30, 26, …
$ income      <dbl> NaN, 2e+04, NaN, NaN, NaN, NaN, 4e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN…
$ height      <dbl> 71, 68, 59, 68, 68, 62, 65, 67, 72, 66, 67, 68, 68, 70, 66, 95, 74, 73, …

Now add in the response variable:

profiles <- profiles %>%
  mutate(
    not_working = as.integer(ifelse(job %in% c("student", "unemployed", "retired"), 1, 0))
  )

profiles %>%
  count(not_working)
The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Please use the `.add` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
profiles %>%
  filter(not_working == "1") %>%
  glimpse()
Rows: ??
Columns: 32
Database: spark_connection
$ body_type   <chr> "curvy", "athletic", "missing", "fit", "fit", "athletic", "missing", "av…
$ diet        <chr> "mostly vegetarian", "missing", "mostly anything", "mostly anything", "m…
$ drinks      <chr> "socially", "socially", "not at all", "socially", "socially", "socially"…
$ drugs       <chr> "missing", "never", "never", "never", "never", "never", "missing", "neve…
$ education   <chr> "working on college/university", "working on two-year college", "working…
$ essay0      <chr> "hello, gentlemen! i am a passionate little minx with a genuine lust\nfo…
$ essay1      <chr> "right now i am in university with a joint major in apparel design\nand …
$ essay2      <chr> "giving hugs<br />\nfigure drawing<br />\nde-awkwardifying situations<br…
$ essay3      <chr> "bright blue eyes, petite, dd cup, silly, bubbly, smiling, and a\nlight<…
$ essay4      <chr> "music: the specials, operation ivy, bad brains, the clash, choking\nvic…
$ essay5      <chr> "self-respect<br />\npure, honest human connection<br />\nsexy time<br /…
$ essay6      <chr> "past sexual encounters while in boring/mundane situations (people\nwond…
$ essay7      <chr> "hanging out with my roomie/best friend, being silly, cracking\njokes, h…
$ essay8      <chr> "i get sentimental about entries sent to postsecret.com. i hope to\nsend…
$ essay9      <chr> "you're kind, silly, fairly intelligent, don't take yourself too\nseriou…
$ ethnicity   <chr> "hispanic / latin, other", "asian", "asian", "white", "white", "white", …
$ job         <chr> "student", "student", "student", "student", "student", "retired", "stude…
$ last_online <chr> "2012-07-01-02-33", "2012-06-27-19-12", "2012-03-26-23-53", "2012-06-30-…
$ location    <chr> "san francisco, california", "redwood city, california", "moraga, califo…
$ offspring   <chr> "doesn&rsquo;t want kids", "doesn&rsquo;t have kids, and doesn&rsquo;t w…
$ orientation <chr> "bisexual", "straight", "straight", "bisexual", "straight", "straight", …
$ pets        <chr> "likes dogs and likes cats", "has dogs and likes cats", "has dogs", "lik…
$ religion    <chr> "atheism and laughing about it", "buddhism but not too serious about it"…
$ sex         <chr> "f", "f", "f", "m", "m", "f", "m", "m", "m", "f", "f", "f", "f", "f", "m…
$ sign        <chr> "virgo but it doesn&rsquo;t matter", "scorpio and it&rsquo;s fun to thin…
$ smokes      <chr> "no", "sometimes", "no", "no", "no", "no", "no", "sometimes", "no", "no"…
$ speaks      <chr> "english (fluently), spanish (okay)", "english (fluently), japanese (poo…
$ status      <chr> "single", "single", "single", "single", "single", "single", "single", "s…
$ age         <dbl> 19, 20, 20, 22, 22, 68, 23, 25, 34, 26, 34, 25, 23, 23, 25, 21, 23, 20, …
$ income      <dbl> NaN, NaN, NaN, 20000, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, …
$ height      <dbl> 62, 63, 63, 76, 73, 69, 74, 71, 73, 68, 63, 65, 68, 67, 65, 67, 62, 66, …
$ not_working <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
sdf_describe(profiles, cols = c("age", "height", "income", "not_working"))

Examples of manual feature engineering

splits <- profiles %>%
  sdf_random_split(training = 0.8, testing = 0.2)

training <- splits$training %>% 
  compute("training")
testing <- splits$testing %>% 
  compute("testing")
sdf_describe(training, cols = c("age", "height", "income", "not_working"))
sdf_describe(testing, cols = c("age", "height", "income", "not_working"))

We basically want the first word of the religion variable

training <- training %>%
  mutate(religion = regexp_extract(religion, "[a-zA-Z]+", 0))
training %>%
  count(religion, not_working) %>%
  group_by(religion) %>%
  summarise(
    count = sum(n),
    prop_not_working = sum(not_working * n) / sum(n)
  )

We suspect that people less likely to regularly drink are also less likely to regularly use drugs, let’s confirm this with a cross-tabulation (also known as a contingency table):

contingency <- training %>%
  sdf_crosstab("drinks", "drugs") %>%
  collect()

contingency

The contingency table is nice, but it would be better to order the levels of drinks and drugs to aid comparisons:

contingency %>%
  rename(drinks = drinks_drugs) %>%
  mutate(
    drinks = as_factor(drinks) %>% 
      fct_relevel("missing", "not at all", "rarely", "socially", "often",
                  "very often", "desperately")
  ) %>% 
  arrange(drinks) %>%
  select(drinks, missing, never, sometimes, often)

Ethnicity requires a bit of work: respondents on OKCupid were permitted to make multiple selections of ethnicity, and so we end up with potentially multiple comma-separated strings. Let’s have a look at possible values (we will use Hive string-handling functions split() and trim() together with explode() to do this):

ethnicities <- profiles %>%
  select(ethnicity) %>%
  mutate(ethnicity = explode(split(ethnicity, ","))) %>%
  mutate(ethnicity = trim(ethnicity)) %>%
  distinct(ethnicity) %>%
  collect()

ethnicities <- ethnicities$ethnicity
ethnicities
 [1] "pacific islander" "native american"  "white"            "missing"         
 [5] "black"            "middle eastern"   "indian"           "other"           
 [9] "hispanic / latin" "asian"           

Now we can use a bit of tidy-evaluation to create dummy columns corresponding to the possible ethnicities. First, we drop missing from the vector, as missing will correspond to zeroes all across the ethnicity dummies.

ethnicities <- setdiff(ethnicities, "missing")
ethnicities
[1] "pacific islander" "native american"  "white"            "black"           
[5] "middle eastern"   "indian"           "other"            "hispanic / latin"
[9] "asian"           
ethnicity_vars <- ethnicities %>% 
  purrr::map(~ expr(ifelse(like(ethnicity, !!.x), 1, 0))) %>%
  purrr::set_names(paste0("ethnicity_", gsub("\\s|/", "", ethnicities)))

training <- mutate(training, !!!ethnicity_vars) %>%
  compute("training")

glimpse(training)
Rows: ??
Columns: 41
Database: spark_connection
$ body_type                 <chr> "a little extra", "a little extra", "a little extra", "a l…
$ diet                      <chr> "anything", "anything", "anything", "anything", "anything"…
$ drinks                    <chr> "often", "rarely", "socially", "socially", "socially", "so…
$ drugs                     <chr> "missing", "never", "missing", "never", "never", "never", …
$ education                 <chr> "missing", "working on ph.d program", "graduated from coll…
$ essay0                    <chr> "im an impulsive creature i love life and having all types…
$ essay1                    <chr> "missing", "missing", "college graduate, i'm paying off my…
$ essay2                    <chr> "cooking i dont cook that often but when i do angels sing …
$ essay3                    <chr> "that im big and scary and that im alot nicer than i look"…
$ essay4                    <chr> "my favorite books are the wheel of time and sword of trut…
$ essay5                    <chr> "i could never do without friends, family, love,.self resp…
$ essay6                    <chr> "the end of the world, dragons,being a ninja,ruling the wo…
$ essay7                    <chr> "the same thing i do every night... try to take over the w…
$ essay8                    <chr> "i believe in ghosts", "missing", "i may have a slight obs…
$ essay9                    <chr> "you want to have fun", "missing", "you're down to earth, …
$ ethnicity                 <chr> "native american, hispanic / latin, white", "hispanic / la…
$ job                       <chr> "sales / marketing / biz dev", "science / tech / engineeri…
$ last_online               <chr> "2012-06-29-09-52", "2012-06-29-18-35", "2012-06-19-18-54"…
$ location                  <chr> "san rafael, california", "berkeley, california", "rodeo, …
$ offspring                 <chr> "missing", "missing", "doesn&rsquo;t have kids, but might …
$ orientation               <chr> "straight", "straight", "straight", "bisexual", "straight"…
$ pets                      <chr> "missing", "missing", "has dogs and likes cats", "likes do…
$ religion                  <chr> "christianity", "catholicism", "catholicism", "other", "mi…
$ sex                       <chr> "m", "m", "m", "m", "f", "m", "m", "m", "f", "m", "f", "m"…
$ sign                      <chr> "pisces and it&rsquo;s fun to think about", "missing", "aq…
$ smokes                    <chr> "yes", "no", "when drinking", "no", "missing", "sometimes"…
$ speaks                    <chr> "english", "english, spanish", "english (fluently), tagalo…
$ status                    <chr> "single", "single", "single", "available", "single", "sing…
$ age                       <dbl> 29, 29, 25, 45, 26, 20, 20, 26, 32, 26, 31, 36, 34, 36, 33…
$ income                    <dbl> 3e+04, NaN, 2e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN,…
$ height                    <dbl> 73, 74, 63, 67, 63, 68, 75, 66, 61, 72, 62, 66, 73, 69, 64…
$ not_working               <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_pacificislander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_nativeamerican  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_white           <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1…
$ ethnicity_black           <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ethnicity_middleeastern   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_indian          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_other           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_hispaniclatin   <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_asian           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Let’s also get the length of the essays written

training <- training %>%
  mutate(
    essay_length = char_length(paste(!!!syms(paste0("essay", 0:9))))
  ) %>% 
  compute("training")

glimpse(training)
Rows: ??
Columns: 42
Database: spark_connection
$ body_type                 <chr> "a little extra", "a little extra", "a little extra", "a l…
$ diet                      <chr> "anything", "anything", "anything", "anything", "anything"…
$ drinks                    <chr> "often", "rarely", "socially", "socially", "socially", "so…
$ drugs                     <chr> "missing", "never", "missing", "never", "never", "never", …
$ education                 <chr> "missing", "working on ph.d program", "graduated from coll…
$ essay0                    <chr> "im an impulsive creature i love life and having all types…
$ essay1                    <chr> "missing", "missing", "college graduate, i'm paying off my…
$ essay2                    <chr> "cooking i dont cook that often but when i do angels sing …
$ essay3                    <chr> "that im big and scary and that im alot nicer than i look"…
$ essay4                    <chr> "my favorite books are the wheel of time and sword of trut…
$ essay5                    <chr> "i could never do without friends, family, love,.self resp…
$ essay6                    <chr> "the end of the world, dragons,being a ninja,ruling the wo…
$ essay7                    <chr> "the same thing i do every night... try to take over the w…
$ essay8                    <chr> "i believe in ghosts", "missing", "i may have a slight obs…
$ essay9                    <chr> "you want to have fun", "missing", "you're down to earth, …
$ ethnicity                 <chr> "native american, hispanic / latin, white", "hispanic / la…
$ job                       <chr> "sales / marketing / biz dev", "science / tech / engineeri…
$ last_online               <chr> "2012-06-29-09-52", "2012-06-29-18-35", "2012-06-19-18-54"…
$ location                  <chr> "san rafael, california", "berkeley, california", "rodeo, …
$ offspring                 <chr> "missing", "missing", "doesn&rsquo;t have kids, but might …
$ orientation               <chr> "straight", "straight", "straight", "bisexual", "straight"…
$ pets                      <chr> "missing", "missing", "has dogs and likes cats", "likes do…
$ religion                  <chr> "christianity", "catholicism", "catholicism", "other", "mi…
$ sex                       <chr> "m", "m", "m", "m", "f", "m", "m", "m", "f", "m", "f", "m"…
$ sign                      <chr> "pisces and it&rsquo;s fun to think about", "missing", "aq…
$ smokes                    <chr> "yes", "no", "when drinking", "no", "missing", "sometimes"…
$ speaks                    <chr> "english", "english, spanish", "english (fluently), tagalo…
$ status                    <chr> "single", "single", "single", "available", "single", "sing…
$ age                       <dbl> 29, 29, 25, 45, 26, 20, 20, 26, 32, 26, 31, 36, 34, 36, 33…
$ income                    <dbl> 3e+04, NaN, 2e+04, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN,…
$ height                    <dbl> 73, 74, 63, 67, 63, 68, 75, 66, 61, 72, 62, 66, 73, 69, 64…
$ not_working               <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_pacificislander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_nativeamerican  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_white           <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1…
$ ethnicity_black           <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ethnicity_middleeastern   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_indian          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_other           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_hispaniclatin   <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ ethnicity_asian           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ essay_length              <int> 892, 79, 2570, 4240, 2945, 2055, 2746, 1837, 2681, 560, 97…

Let’s see how to scale a variable manually:

scale_age_values <- training %>%
  summarize(
    mean_age = mean(age),
    sd_age = sd(age)
  ) %>%
  collect()
Missing values are always removed in SQL.
Use `mean(x, na.rm = TRUE)` to silence this warning
This warning is displayed only once per session.
scale_age_values
training <- training %>%
  mutate(
    scaled_age = (age - !!scale_age_values$mean_age) / !!scale_age_values$sd_age
  )
training

Fitting a model

lr <- ml_logistic_regression(
  training, not_working ~ scaled_age + sex + drinks + drugs + essay_length + status + diet + education
)

validation_info <- ml_evaluate(lr, training)
validation_info
BinaryLogisticRegressionSummaryImpl 
 Access the following via `$` or `ml_summary()`. 
 - features_col() 
 - label_col() 
 - predictions() 
 - probability_col() 
 - area_under_roc() 
 - f_measure_by_threshold() 
 - pr() 
 - precision_by_threshold() 
 - recall_by_threshold() 
 - roc() 
 - prediction_col() 
 - accuracy() 
 - f_measure_by_label() 
 - false_positive_rate_by_label() 
 - labels() 
 - precision_by_label() 
 - recall_by_label() 
 - true_positive_rate_by_label() 
 - weighted_f_measure() 
 - weighted_false_positive_rate() 
 - weighted_precision() 
 - weighted_recall() 
 - weighted_true_positive_rate() 
roc <- validation_info$roc() %>% 
  collect()

ggplot(roc, aes(x = FPR, y = TPR)) +
  geom_line() + geom_abline(lty = "dashed") +
  coord_fixed()

validation_info$area_under_roc()
[1] 0.875979

Apply model to test set

scale_age_values <- testing %>%
  summarize(
    mean_age = mean(age),
    sd_age = sd(age)
  ) %>%
  collect()

testing <- testing %>%
  mutate(
    essay_length = char_length(paste(!!!syms(paste0("essay", 0:9)))),
    scaled_age = (age - !!scale_age_values$mean_age) / !!scale_age_values$sd_age
  ) %>%
  compute("testing")

validation_info_test <- ml_evaluate(lr, testing)
validation_info_test
BinaryLogisticRegressionSummaryImpl 
 Access the following via `$` or `ml_summary()`. 
 - features_col() 
 - label_col() 
 - predictions() 
 - probability_col() 
 - area_under_roc() 
 - f_measure_by_threshold() 
 - pr() 
 - precision_by_threshold() 
 - recall_by_threshold() 
 - roc() 
 - prediction_col() 
 - accuracy() 
 - f_measure_by_label() 
 - false_positive_rate_by_label() 
 - labels() 
 - precision_by_label() 
 - recall_by_label() 
 - true_positive_rate_by_label() 
 - weighted_f_measure() 
 - weighted_false_positive_rate() 
 - weighted_precision() 
 - weighted_recall() 
 - weighted_true_positive_rate() 
roc <- validation_info_test$roc() %>% 
  collect()

ggplot(roc, aes(x = FPR, y = TPR)) +
  geom_line() + geom_abline(lty = "dashed") +
  coord_fixed()


validation_info_test$area_under_roc()
[1] 0.883113
spark_write_parquet(training, "training_data")
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHNwYXJrbHlyKQpzYyA8LSBzcGFya19jb25uZWN0KG1hc3RlciA9ICJsb2NhbCIsIHZlcnNpb24gPSAiMi40LjUiKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKcHJvZmlsZXMgPC0gc3BhcmtfcmVhZF9jc3YoCiAgc2MsIAogICJkYXRhL3Byb2ZpbGVzLmNzdiIsCiAgZXNjYXBlID0gIlwiIiwKICBtZW1vcnkgPSBGQUxTRSwKICBvcHRpb25zID0gbGlzdChtdWx0aWxpbmUgPSBUUlVFKQopCgpnbGltcHNlKHByb2ZpbGVzKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpwcm9maWxlcyAlPiUKICBzdW1tYXJpc2VfYWxsKC5mdW5zID0gfiBzdW0oYXMuaW50ZWdlcihpcy5uYSguKSkpKQoKcHJvZmlsZXMgJT4lCiAgc3VtbWFyaXNlKG51bV9uZWdfaW5jb21lID0gc3VtKGFzLmludGVnZXIoaW5jb21lIDwgMCkpKQpgYGAKCmBgYHtyfQpwcm9maWxlc19jaGFyIDwtIHByb2ZpbGVzICU+JQogIHNlbGVjdCgtYyhhZ2UsIGluY29tZSwgaGVpZ2h0KSkgJT4lCiAgbXV0YXRlX2FsbCh+IGlmZWxzZShpcy5uYSguKSwgIm1pc3NpbmciLCAuKSkKCnByb2ZpbGVzX251bSA8LSBwcm9maWxlcyAlPiUKICBzZWxlY3QoYWdlLCBpbmNvbWUsIGhlaWdodCkgJT4lCiAgbXV0YXRlKAogICAgYWdlID0gYXMubnVtZXJpYyhhZ2UpLAogICAgaW5jb21lID0gaWZlbHNlKGluY29tZSA9PSAiLTEiLCBOQSwgYXMubnVtZXJpYyhpbmNvbWUpKSwKICAgIGhlaWdodCA9IGFzLm51bWVyaWMoaGVpZ2h0KQogICkKCnByb2ZpbGVzIDwtIHNkZl9iaW5kX2NvbHMocHJvZmlsZXNfY2hhciwgcHJvZmlsZXNfbnVtKSAlPiUKICBjb21wdXRlKCJwcm9maWxlcyIpCmBgYAoKYGBge3J9CnByb2ZpbGVzICU+JQogIHN1bW1hcmlzZV9hbGwofnN1bShhcy5pbnRlZ2VyKGlzLm5hKC4pKSkpCgpwcm9maWxlcyAlPiUKICBzdW1tYXJpc2UobnVtX25lZ19pbmNvbWUgPSBzdW0oYXMuaW50ZWdlcihpbmNvbWUgPCAwKSkpCmBgYAoKYGBge3J9CmdsaW1wc2UocHJvZmlsZXMpCmBgYAoKTm93IGFkZCBpbiB0aGUgcmVzcG9uc2UgdmFyaWFibGU6CgpgYGB7cn0KcHJvZmlsZXMgPC0gcHJvZmlsZXMgJT4lCiAgbXV0YXRlKAogICAgbm90X3dvcmtpbmcgPSBhcy5pbnRlZ2VyKGlmZWxzZShqb2IgJWluJSBjKCJzdHVkZW50IiwgInVuZW1wbG95ZWQiLCAicmV0aXJlZCIpLCAxLCAwKSkKICApCgpwcm9maWxlcyAlPiUKICBjb3VudChub3Rfd29ya2luZykKYGBgCgpgYGB7cn0KcHJvZmlsZXMgJT4lCiAgZmlsdGVyKG5vdF93b3JraW5nID09ICIxIikgJT4lCiAgZ2xpbXBzZSgpCmBgYAoKCmBgYHtyfQpzZGZfZGVzY3JpYmUocHJvZmlsZXMsIGNvbHMgPSBjKCJhZ2UiLCAiaGVpZ2h0IiwgImluY29tZSIsICJub3Rfd29ya2luZyIpKQpgYGAKCiMjIEV4YW1wbGVzIG9mIG1hbnVhbCBmZWF0dXJlIGVuZ2luZWVyaW5nCgpgYGB7cn0Kc3BsaXRzIDwtIHByb2ZpbGVzICU+JQogIHNkZl9yYW5kb21fc3BsaXQodHJhaW5pbmcgPSAwLjgsIHRlc3RpbmcgPSAwLjIpCgp0cmFpbmluZyA8LSBzcGxpdHMkdHJhaW5pbmcgJT4lIAogIGNvbXB1dGUoInRyYWluaW5nIikKdGVzdGluZyA8LSBzcGxpdHMkdGVzdGluZyAlPiUgCiAgY29tcHV0ZSgidGVzdGluZyIpCmBgYAoKYGBge3J9CnNkZl9kZXNjcmliZSh0cmFpbmluZywgY29scyA9IGMoImFnZSIsICJoZWlnaHQiLCAiaW5jb21lIiwgIm5vdF93b3JraW5nIikpCmBgYAoKYGBge3J9CnNkZl9kZXNjcmliZSh0ZXN0aW5nLCBjb2xzID0gYygiYWdlIiwgImhlaWdodCIsICJpbmNvbWUiLCAibm90X3dvcmtpbmciKSkKYGBgCgpXZSBiYXNpY2FsbHkgd2FudCB0aGUgZmlyc3Qgd29yZCBvZiB0aGUgYHJlbGlnaW9uYCB2YXJpYWJsZQoKYGBge3J9CnRyYWluaW5nIDwtIHRyYWluaW5nICU+JQogIG11dGF0ZShyZWxpZ2lvbiA9IHJlZ2V4cF9leHRyYWN0KHJlbGlnaW9uLCAiW2EtekEtWl0rIiwgMCkpCmBgYAoKYGBge3J9CnRyYWluaW5nICU+JQogIGNvdW50KHJlbGlnaW9uLCBub3Rfd29ya2luZykgJT4lCiAgZ3JvdXBfYnkocmVsaWdpb24pICU+JQogIHN1bW1hcmlzZSgKICAgIGNvdW50ID0gc3VtKG4pLAogICAgcHJvcF9ub3Rfd29ya2luZyA9IHN1bShub3Rfd29ya2luZyAqIG4pIC8gc3VtKG4pCiAgKQpgYGAKV2Ugc3VzcGVjdCB0aGF0IHBlb3BsZSBsZXNzIGxpa2VseSB0byByZWd1bGFybHkgZHJpbmsgYXJlIGFsc28gbGVzcyBsaWtlbHkgdG8gcmVndWxhcmx5IHVzZSBkcnVncywgbGV0J3MgY29uZmlybSB0aGlzIHdpdGggYSBjcm9zcy10YWJ1bGF0aW9uIChhbHNvIGtub3duIGFzIGEgY29udGluZ2VuY3kgdGFibGUpOgoKYGBge3J9CmNvbnRpbmdlbmN5IDwtIHRyYWluaW5nICU+JQogIHNkZl9jcm9zc3RhYigiZHJpbmtzIiwgImRydWdzIikgJT4lCiAgY29sbGVjdCgpCgpjb250aW5nZW5jeQpgYGAKClRoZSBjb250aW5nZW5jeSB0YWJsZSBpcyBuaWNlLCBidXQgaXQgd291bGQgYmUgYmV0dGVyIHRvIG9yZGVyIHRoZSBsZXZlbHMgb2YgYGRyaW5rc2AgYW5kIGBkcnVnc2AgdG8gYWlkIGNvbXBhcmlzb25zOgoKYGBge3J9CmNvbnRpbmdlbmN5ICU+JQogIHJlbmFtZShkcmlua3MgPSBkcmlua3NfZHJ1Z3MpICU+JQogIG11dGF0ZSgKICAgIGRyaW5rcyA9IGFzX2ZhY3Rvcihkcmlua3MpICU+JSAKICAgICAgZmN0X3JlbGV2ZWwoIm1pc3NpbmciLCAibm90IGF0IGFsbCIsICJyYXJlbHkiLCAic29jaWFsbHkiLCAib2Z0ZW4iLAogICAgICAgICAgICAgICAgICAidmVyeSBvZnRlbiIsICJkZXNwZXJhdGVseSIpCiAgKSAlPiUgCiAgYXJyYW5nZShkcmlua3MpICU+JQogIHNlbGVjdChkcmlua3MsIG1pc3NpbmcsIG5ldmVyLCBzb21ldGltZXMsIG9mdGVuKQpgYGAKCkV0aG5pY2l0eSByZXF1aXJlcyBhIGJpdCBvZiB3b3JrOiByZXNwb25kZW50cyBvbiBPS0N1cGlkIHdlcmUgcGVybWl0dGVkIHRvIG1ha2UgbXVsdGlwbGUgc2VsZWN0aW9ucyBvZiBldGhuaWNpdHksIGFuZCBzbyB3ZSBlbmQgdXAgd2l0aCBwb3RlbnRpYWxseSBtdWx0aXBsZSBjb21tYS1zZXBhcmF0ZWQgc3RyaW5ncy4gTGV0J3MgaGF2ZSBhIGxvb2sgYXQgcG9zc2libGUgdmFsdWVzICh3ZSB3aWxsIHVzZSBIaXZlIHN0cmluZy1oYW5kbGluZyBmdW5jdGlvbnMgYHNwbGl0KClgIGFuZCBgdHJpbSgpYCB0b2dldGhlciB3aXRoIGBleHBsb2RlKClgIHRvIGRvIHRoaXMpOiAKCmBgYHtyfQpldGhuaWNpdGllcyA8LSBwcm9maWxlcyAlPiUKICBzZWxlY3QoZXRobmljaXR5KSAlPiUKICBtdXRhdGUoZXRobmljaXR5ID0gZXhwbG9kZShzcGxpdChldGhuaWNpdHksICIsIikpKSAlPiUKICBtdXRhdGUoZXRobmljaXR5ID0gdHJpbShldGhuaWNpdHkpKSAlPiUKICBkaXN0aW5jdChldGhuaWNpdHkpICU+JQogIGNvbGxlY3QoKQoKZXRobmljaXRpZXMgPC0gZXRobmljaXRpZXMkZXRobmljaXR5CmV0aG5pY2l0aWVzCmBgYAoKTm93IHdlIGNhbiB1c2UgYSBiaXQgb2YgdGlkeS1ldmFsdWF0aW9uIHRvIGNyZWF0ZSBkdW1teSBjb2x1bW5zIGNvcnJlc3BvbmRpbmcgdG8gdGhlIHBvc3NpYmxlIGV0aG5pY2l0aWVzLiBGaXJzdCwgd2UgZHJvcCBgbWlzc2luZ2AgZnJvbSB0aGUgdmVjdG9yLCBhcyBgbWlzc2luZ2Agd2lsbCBjb3JyZXNwb25kIHRvIHplcm9lcyBhbGwgYWNyb3NzIHRoZSBldGhuaWNpdHkgZHVtbWllcy4KCmBgYHtyfQpldGhuaWNpdGllcyA8LSBzZXRkaWZmKGV0aG5pY2l0aWVzLCAibWlzc2luZyIpCmV0aG5pY2l0aWVzCmBgYAoKYGBge3J9CmV0aG5pY2l0eV92YXJzIDwtIGV0aG5pY2l0aWVzICU+JSAKICBwdXJycjo6bWFwKH4gZXhwcihpZmVsc2UobGlrZShldGhuaWNpdHksICEhLngpLCAxLCAwKSkpICU+JQogIHB1cnJyOjpzZXRfbmFtZXMocGFzdGUwKCJldGhuaWNpdHlfIiwgZ3N1YigiXFxzfC8iLCAiIiwgZXRobmljaXRpZXMpKSkKCnRyYWluaW5nIDwtIG11dGF0ZSh0cmFpbmluZywgISEhZXRobmljaXR5X3ZhcnMpICU+JQogIGNvbXB1dGUoInRyYWluaW5nIikKCmdsaW1wc2UodHJhaW5pbmcpCmBgYAoKTGV0J3MgYWxzbyBnZXQgdGhlIGxlbmd0aCBvZiB0aGUgZXNzYXlzIHdyaXR0ZW4gCgpgYGB7cn0KdHJhaW5pbmcgPC0gdHJhaW5pbmcgJT4lCiAgbXV0YXRlKAogICAgZXNzYXlfbGVuZ3RoID0gY2hhcl9sZW5ndGgocGFzdGUoISEhc3ltcyhwYXN0ZTAoImVzc2F5IiwgMDo5KSkpKQogICkgJT4lIAogIGNvbXB1dGUoInRyYWluaW5nIikKCmdsaW1wc2UodHJhaW5pbmcpCmBgYAoKTGV0J3Mgc2VlIGhvdyB0byBzY2FsZSBhIHZhcmlhYmxlIG1hbnVhbGx5OgoKYGBge3J9CnNjYWxlX2FnZV92YWx1ZXMgPC0gdHJhaW5pbmcgJT4lCiAgc3VtbWFyaXplKAogICAgbWVhbl9hZ2UgPSBtZWFuKGFnZSksCiAgICBzZF9hZ2UgPSBzZChhZ2UpCiAgKSAlPiUKICBjb2xsZWN0KCkKCnNjYWxlX2FnZV92YWx1ZXMKYGBgCgpgYGB7cn0KdHJhaW5pbmcgPC0gdHJhaW5pbmcgJT4lCiAgbXV0YXRlKAogICAgc2NhbGVkX2FnZSA9IChhZ2UgLSAhIXNjYWxlX2FnZV92YWx1ZXMkbWVhbl9hZ2UpIC8gISFzY2FsZV9hZ2VfdmFsdWVzJHNkX2FnZQogICkKYGBgCgpgYGB7cn0KdHJhaW5pbmcKYGBgCgojIyBGaXR0aW5nIGEgbW9kZWwKCgpgYGB7cn0KbHIgPC0gbWxfbG9naXN0aWNfcmVncmVzc2lvbigKICB0cmFpbmluZywgbm90X3dvcmtpbmcgfiBzY2FsZWRfYWdlICsgc2V4ICsgZHJpbmtzICsgZHJ1Z3MgKyBlc3NheV9sZW5ndGggKyBzdGF0dXMgKyBkaWV0ICsgZWR1Y2F0aW9uCikKCnZhbGlkYXRpb25faW5mbyA8LSBtbF9ldmFsdWF0ZShsciwgdHJhaW5pbmcpCnZhbGlkYXRpb25faW5mbwpgYGAKCgpgYGB7cn0Kcm9jIDwtIHZhbGlkYXRpb25faW5mbyRyb2MoKSAlPiUgCiAgY29sbGVjdCgpCgpnZ3Bsb3Qocm9jLCBhZXMoeCA9IEZQUiwgeSA9IFRQUikpICsKICBnZW9tX2xpbmUoKSArIGdlb21fYWJsaW5lKGx0eSA9ICJkYXNoZWQiKSArCiAgY29vcmRfZml4ZWQoKQpgYGAKCmBgYHtyfQp2YWxpZGF0aW9uX2luZm8kYXJlYV91bmRlcl9yb2MoKQpgYGAKCiMjIEFwcGx5IG1vZGVsIHRvIHRlc3Qgc2V0CgpgYGB7cn0Kc2NhbGVfYWdlX3ZhbHVlcyA8LSB0ZXN0aW5nICU+JQogIHN1bW1hcml6ZSgKICAgIG1lYW5fYWdlID0gbWVhbihhZ2UpLAogICAgc2RfYWdlID0gc2QoYWdlKQogICkgJT4lCiAgY29sbGVjdCgpCgp0ZXN0aW5nIDwtIHRlc3RpbmcgJT4lCiAgbXV0YXRlKAogICAgZXNzYXlfbGVuZ3RoID0gY2hhcl9sZW5ndGgocGFzdGUoISEhc3ltcyhwYXN0ZTAoImVzc2F5IiwgMDo5KSkpKSwKICAgIHNjYWxlZF9hZ2UgPSAoYWdlIC0gISFzY2FsZV9hZ2VfdmFsdWVzJG1lYW5fYWdlKSAvICEhc2NhbGVfYWdlX3ZhbHVlcyRzZF9hZ2UKICApICU+JQogIGNvbXB1dGUoInRlc3RpbmciKQoKdmFsaWRhdGlvbl9pbmZvX3Rlc3QgPC0gbWxfZXZhbHVhdGUobHIsIHRlc3RpbmcpCnZhbGlkYXRpb25faW5mb190ZXN0Cgpyb2MgPC0gdmFsaWRhdGlvbl9pbmZvX3Rlc3Qkcm9jKCkgJT4lIAogIGNvbGxlY3QoKQoKZ2dwbG90KHJvYywgYWVzKHggPSBGUFIsIHkgPSBUUFIpKSArCiAgZ2VvbV9saW5lKCkgKyBnZW9tX2FibGluZShsdHkgPSAiZGFzaGVkIikgKwogIGNvb3JkX2ZpeGVkKCkKCnZhbGlkYXRpb25faW5mb190ZXN0JGFyZWFfdW5kZXJfcm9jKCkKYGBgCgpgYGB7cn0Kc3Bhcmtfd3JpdGVfcGFycXVldCh0cmFpbmluZywgInRyYWluaW5nX2RhdGEiKQpgYGAKCgo=